# Loading packages and data ----
# data preprocessing
library(tidyverse)
library(tidytext)
library(lubridate)
library(tm)
library(SnowballC)
library(wordcloud)
library(qdapDictionaries)
library(reshape2)
# data exploration
library(summarytools) # for user-friendly html summaries of data
library(ggmap) # for plotting data on a map
library(hrbrthemes)
library(showtext)
library(usmap)
# Global options: never truncate the columns shown when a tibble is printed.
options(dplyr.width = Inf)
# Let showtext handle text rendering on graphics devices.
showtext_auto()
# Register the "Roboto Condensed" family straight from Google Fonts.
sysfonts::font_add_google(name = "Roboto Condensed")
# Use the hrbrthemes Roboto Condensed theme as the default ggplot2 theme.
theme_set(theme_ipsum_rc())
# Load speeches ----
# Read every CSV file under data/processed/ into a single tibble, tidy the
# column types, and drop rows that have no speech text.
speeches <-
  list.files(
    path = "data/processed/",
    # `pattern` is a regular expression, not a glob: anchor on a literal
    # ".csv" suffix so names that merely contain "csv" are not picked up.
    pattern = "\\.csv$",
    full.names = TRUE
  ) %>%
  readr::read_csv(
    # Record each row's source path; it is only used to derive `session`.
    id = "file_name",
    # Read the identifier-like columns as text instead of letting readr guess.
    col_types = cols(
      speech_id = col_character(),
      speakerid = col_character(),
      district = col_character()
    )
  ) %>%
  mutate(
    chamber = factor(chamber),
    gender = factor(gender),
    # Spell out the two party codes D and R; other levels are left unchanged.
    party = recode(factor(party), D = "Democratic", R = "Republican"),
    nonvoting = factor(nonvoting),
    # Session number = first run of digits in the file name itself; basename()
    # keeps digits in a directory name from being mistaken for the session.
    session = as.integer(str_extract(basename(file_name), "[0-9]+")),
    file_name = NULL,
    name = paste(firstname, lastname)
  ) %>%
  filter(!is.na(speech))
# Remove duplicates
# Keep only speeches whose text occurs exactly once: when the same text
# appears several times, ALL of its copies are dropped (none is kept).
# NOTE(review): presumably intended to discard repeated boilerplate; confirm
# that keeping one copy per text is not what is wanted.
speeches <- speeches %>%
  group_by(speech) %>%
  filter(n() == 1) %>%
  ungroup()
# Add word counts for each speech
# Count words with stringr's word-boundary matcher. Counting runs of non-word
# characters ("\\W+") counts the separators between words instead, so it is
# off by one for text without trailing punctuation ("hello world" -> 1) and
# is thrown off by leading punctuation or whitespace.
speeches <- speeches %>%
  mutate(word_count = str_count(speech, boundary("word")))
# Build a summarytools data-frame summary and open it with view()
# (the HTML report is written to a temporary file, not shown inline).
speeches %>%
  dfSummary() %>%
  view()
# Captured console output from a previous run:
## Switching method to 'browser'
## Output file written: C:\Users\dmitl\AppData\Local\Temp\RtmpgbSdGT\file417c6ae15523.html
# Print the cleaned speeches tibble.
speeches